##########################################
# Refugee Exposure, Elections, Development
# Afrobarometer Data Clean
# May 24, 2021
##########################################

## Clean afrobarometer data and calculate refugee exposure
## Switch off sf's s2 backend for the whole session. This runs before any
## spatial object is created or measured below.
## NOTE(review): presumably this makes sf use its non-s2 code path for the
## lon/lat distance calculations -- confirm against the sf documentation.
sf::sf_use_s2(FALSE)

library(tidyverse)
library(rgdal)
library(raster)
library(sf)
library(doMC)
library(haven)
library(estimatr)
library(geosphere)
library(readxl)

## Outcome variables other than migrants_movement_standardized.
## These are set to NA for 2015 rows below, so that they are taken
## from the 2016 AB round instead of the 2015 one.
## Names of the outcome columns: the standardized survey items first,
## followed by the two index columns.
outcome_vars <- c(
  paste0(
    c(
      "migrants_attitude",
      "feel_unsafe",
      "feared_crime",
      "presidential_approval",
      "partisanship_nrm",
      "trust_president",
      "trust_rulling_party",
      "born_non_ugandans",
      "foreigner_residents"
    ),
    "_standardized"
  ),
  "wealth_index",
  "gvmt_perf_index"
)

# ab_data_orig <- read_csv("ab_final_v4_geocoded_with_index.csv") %>%
#   mutate(key = paste0(year, respno))

## Load the geocoded Afrobarometer data and add a year + respno key.
## Every column listed in outcome_vars is then blanked (NA) on rows where
## year == 2015; all other rows (including rows with a missing year) keep
## their original values.
## all_of() makes the use of the external character vector explicit: passing
## a bare vector to across() is deprecated by tidyselect.
ab_data <- read.csv("ab_final_v5_geocoded_with_index.csv") %>%
  mutate(key = paste0(year, respno)) %>%
  mutate(
    across(
      all_of(outcome_vars),
      ~case_when(year == 2015 ~ NA_real_, TRUE ~ .x)
    )
  )
# ab_data <- ab_data[match(ab_data_orig$key, ab_data$key),]

## ---------------
## Load shapefiles
## ---------------
## Settlement boundary polygons (main layer).
refsites <- readOGR(
  dsn = path.expand("ugakaimug"),
  layer = "settlement_boundaries"
)

## Kitgum and Okollo boundaries, re-projected onto the CRS of the main
## settlement layer so both sets can be measured against the same points.
refsites_kitgum_okollo <- spTransform(
  readOGR(
    dsn = path.expand("Kitgum and Okollo"),
    layer = "Boundaries_Kitgum_Okollo"
  ),
  crs(refsites)
)

## Clean up refsites names
## Overwrite settlement names by row position in the attribute table.
## The positions are hard-coded, so this depends on the row order of the
## settlement_boundaries layer staying as it is.
settlement_names <- as.character(refsites@data$Name_setlm)
settlement_names[c(4, 5)] <- c("Oliji I", "Oliji II")
settlement_names[c(17, 18)] <- c("Maaji 1A", "Maaji 1B")
settlement_names[30:39] <- paste(
  "Palorinya",
  c("I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X")
)
refsites@data$Name_setlm <- settlement_names

## -------------------------------
## Get distance to nearest refsite
## -------------------------------
## sf versions of both settlement layers.
refsites_sf <- refsites %>% st_as_sf()
refsites_ko_sf <- refsites_kitgum_okollo %>% st_as_sf()

## Respondent locations as sf points. The longitude/latitude columns are
## tagged with the settlement layer's CRS (tagged, not re-projected).
## NOTE(review): this assumes the settlement layer is itself in a lon/lat
## CRS -- confirm.
ab_sf <- ab_data %>%
  st_as_sf(
    coords = c("longitude", "latitude"),
    crs = st_crs(refsites_sf)
  )

# ## WARNING:
# ##     TAKES HOURS TO RUN
# registerDoMC(detectCores()-1)
# refsites_dist <- foreach(i = 1:nrow(ab_sf), .combine = "rbind") %dopar% {
#     out <- st_distance(ab_sf[i,], refsites_sf)
#     if(i %% 100 == 0){
#         print(paste0("Done with ", i, " parishes at ", Sys.time(), "."))
#     }
#     return(out/1000)
# }
# write_csv(as.data.frame(refsites_dist), path = "ab_distance_to_refsite.csv")

## Load the cached respondent-by-settlement distance table. This is the file
## written by the commented-out loop above, which divides the st_distance()
## output by 1000 before saving.
refsites_dist <- read_csv("ab_distance_to_refsite.csv")

# ## Add in kitgum and okollo
# registerDoMC(detectCores()-1)
# refsites_ko_dist <- foreach(i = 1:nrow(ab_sf), .combine = "rbind") %dopar% {
#     out <- st_distance(ab_sf[i,], refsites_ko_sf)
#     if(i %% 100 == 0){
#         print(paste0("Done with ", i, " parishes at ", Sys.time(), "."))
#     }
#     return(out/1000)
# }
# write_csv(as.data.frame(refsites_ko_dist), path = "ab_distance_to_refsite_kitgum_okollo.csv")

## Cached distances to the Kitgum and Okollo sites (same layout as above).
refsites_ko_dist <- read_csv("ab_distance_to_refsite_kitgum_okollo.csv")

## Put together: main settlements first, then Kitgum/Okollo, and label each
## column "<settlement name> Distance". Labels are assigned by position, so
## the column order of the cached files must match the shapefile row order.
site_labels <- c(
  refsites@data$Name_setlm,
  as.character(refsites_kitgum_okollo@data$Name)
)
refsites_dist <- bind_cols(refsites_dist, refsites_ko_dist)
colnames(refsites_dist) <- paste(site_labels, "Distance")

## ----------------------
## Get distance to border
## ----------------------
## Country boundary layer. In the live code it is only loaded; the distance
## calculation that uses it is commented out below and its result is read
## from a cached CSV instead.
uganda_boundary <- readOGR(
  dsn = path.expand("Uganda_countryboundaries_adm0"),
  layer = "uga_admbnda_adm0_UBOS_v2"
)

# ## WARNING:
# ##     TAKES HOURS TO RUN
# ## Calculate distance
# registerDoMC(detectCores()-1)
# border_dist <- foreach(i = 1:nrow(ab_data), .combine = "rbind") %dopar% {
#     ab_sp <- SpatialPoints(ab_data[i,c("longitude", "latitude")], proj4string = crs(ab_sf))
#     out <- dist2Line(ab_sp, uganda_boundary)
#     if(i %% 100 == 0){
#         print(paste0("Done with ", i, " parishes at ", Sys.time(), "."))
#     }
#     return(out[,1]/1000)
# }
# write_csv(as.data.frame(border_dist), path = "ab_distance_to_border.csv")

## Cached distance-to-border values, stored as a single column that is
## renamed to "borderdist".
border_dist <- read_csv("ab_distance_to_border.csv") %>%
  setNames("borderdist")

## --------------------
## Get distance to road
## --------------------
## Road network, re-projected to the CRS of the respondent points, plus an
## sf copy of it.
uganda_road <- readOGR(
  dsn = path.expand("Uganda_roads_feb2009"),
  layer = "Uganda_Roads_Feb2009"
) %>%
  spTransform(crs(ab_sf))
uganda_road_sf <- uganda_road %>% st_as_sf()

# ## WARNING:
# ##     TAKES HOURS TO RUN
# ## Calculate distance
# registerDoMC(detectCores()-1)
# road_dist <- foreach(i = 1:nrow(ab_sf), .combine = "rbind") %dopar% {
#     out <- st_distance(ab_sf[i,], uganda_road_sf)
#     if(i %% 100 == 0){
#         print(paste0("Done with ", i, " parishes at ", Sys.time(), "."))
#     }
#     return(min(as.numeric(out), na.rm = TRUE)/1000)
# }
# write_csv(as.data.frame(road_dist), path = "ab_distance_to_road.csv")

## Cached distance-to-nearest-road values, stored as a single column that is
## renamed to "roaddist".
road_dist <- read_csv("ab_distance_to_road.csv") %>%
  setNames("roaddist")

## -----------------------
## Get distance to capital
## -----------------------
## Kampala as a single point (longitude, latitude), in the same CRS as the
## respondent points.
kampala_coord <- st_point(x = c(32.595242, 0.310841)) %>%
  st_sfc(crs = st_crs(ab_sf))

## Distance from every respondent to Kampala, divided by 1000.
## st_distance() is vectorised: one call returns an nrow(ab_sf) x 1 matrix,
## so no per-row parallel loop (and no doMC backend registration) is needed.
## This also behaves correctly when ab_sf has zero or one row, where
## 1:nrow() and the rbind-combined foreach did not.
## as.numeric() drops the units attribute so the column is plain numeric.
cap_dist <- data.frame(
  capitoldist = as.numeric(st_distance(ab_sf, kampala_coord)[, 1]) / 1000
)

## -------------------
## Merge onto AB data
## -------------------
## D = District
## C = County
## S = Sub-county

## Attach the distance tables column-wise, one after another in this order:
## settlement distances, border, road, capital. Rows are matched purely by
## position, so every table must have the same row order as ab_data.
distance_tables <- list(refsites_dist, border_dist, road_dist, cap_dist)
ab_data <- purrr::reduce(distance_tables, bind_cols, .init = ab_data) %>%
  mutate(P_02_ID = as.character(P_02_ID))

## -------------------------
## Get min distance variable
## -------------------------
## Row-wise minimum across every "<settlement> Distance" column, regardless
## of whether the settlement had any population in the survey year.
## (This column is dropped and recomputed further down in the script.)
ab_data <- ab_data %>%
  mutate(
    min_distance = as.numeric(
      do.call(
        pmin,
        unname(as.list(dplyr::select(., ends_with(" Distance"))))
      )
    )
  )

## ----------------------------
## Change the years to match AB
## ----------------------------

## In 2015 there is one question we want, for all others we want 2016
## Migrant movement

## Recode survey years through a lookup table (survey year -> recoded year).
## 2015 and 2016 both map to 2016; any year not listed becomes NA.
ab_data <- ab_data %>%
  mutate(
    year = unname(
      c(
        `2005` = 2001,
        `2008` = 2006,
        `2011` = 2011,
        `2015` = 2016,
        `2016` = 2016
      )[as.character(year)]
    )
  )

## --------------------------------
## Merge in the populations by year
## --------------------------------
## Settlement populations by year; keep only the settlement name, the year
## and the population count.
refsites_pop <- read_csv("uga_refsites_population_final_analysis.csv") %>%
  dplyr::select(Name_setlm, Year, `Refugee Population`)

## Check names
## Settlement names recovered from the distance column headers.
all_sites <- gsub(" Distance", "", colnames(refsites_dist))

## Diagnostics only: both should be TRUE when the population file and the
## distance table cover the same set of settlement names.
all(is.element(refsites_pop$Name_setlm, all_sites))
all(is.element(all_sites, refsites_pop$Name_setlm))

## Widen
## One row per Year, one "<settlement> Population" column per settlement.
## Year is shifted forward by one, so a population recorded for year t is
## later joined to survey year t + 1.
refsites_pop <- refsites_pop %>%
  pivot_wider(names_from = Name_setlm, values_from = `Refugee Population`) %>%
  mutate(Year = Year + 1) %>%
  rename_with(~paste(.x, "Population"), .cols = -Year)

## Final data merge
## Attach the population columns for each respondent's (recoded) survey year.
## Survey years with no population row get NA in every Population column.
ab_data <- left_join(ab_data, refsites_pop, by = c(year = "Year"))
nrow(ab_data)

## --------------------
## Fix nearest distance
## --------------------
## Long table with one row per respondent (round, respno) and settlement
## ("camp"), carrying that camp's distance and its population in the
## respondent's survey year.
##
## The original script built this identical table twice (once as nd_df, once
## as exposure_measure_df). It is now built once and shared. The join keys
## are spelled out instead of being inferred from common column names.
exposure_measure_df <- inner_join(
  ab_data %>%
    dplyr::select(round, respno, ends_with(" Distance")) %>%
    pivot_longer(cols = ends_with(" Distance"), names_to = "camp", values_to = "distance") %>%
    mutate(camp = gsub(" Distance", "", camp)),
  ab_data %>%
    dplyr::select(round, respno, ends_with(" Population")) %>%
    pivot_longer(cols = ends_with(" Population"), names_to = "camp", values_to = "population") %>%
    mutate(camp = gsub(" Population", "", camp)),
  by = c("round", "respno", "camp")
)
nd_df <- exposure_measure_df

## Nearest settlement that actually had population (> 0) in the survey year.
## .groups = "drop" returns an ungrouped table, one row per respondent.
min_distance_fixed <- nd_df %>%
  filter(population > 0) %>%
  group_by(round, respno) %>%
  summarize(min_distance = min(distance), .groups = "drop")

## Replace the earlier all-settlements min_distance with the populated-only
## version. Respondents with no populated settlement get NA.
ab_data <- ab_data %>%
  dplyr::select(-min_distance) %>%
  left_join(min_distance_fixed, by = c("round", "respno"))

## -------------------
## Create new measures
## -------------------
## exposure_measure_df (the long respondent x camp table built above) is the
## input to the exposure measures that follow.

## Make measures - all camps
##
## Collapses the long respondent x camp table to one row per respondent
## (round, respno):
##   * exposure = population / (distance + 1) per camp, and its log;
##   * camps whose population is not > 0 (including NA) are dropped before
##     anything is aggregated;
##   * sum_exposure_* / avg_all_exposure_* aggregate exposure over the camps
##     whose distance is below the cutoff in the column name;
##     coalesce(..., 0) supplies 0 when the aggregate is missing;
##   * avg_all_exposure_ln_* are logs of the averages, with any non-finite
##     result (e.g. log(0) = -Inf) replaced by 0;
##   * nearest_exposure / nearest_exposure_ln are the values of the closest
##     remaining camp, ties broken by the largest population.
## The result is ungrouped before being stored, so later steps do not
## inherit the (round, respno) grouping.
exposure_measure_df_o_all <- exposure_measure_df %>%
  mutate(
    exposure = population / (distance + 1),
    exposure_ln = log(exposure)
  ) %>%
  group_by(round, respno) %>%
  filter(population > 0) %>%
  mutate(
    sum_exposure_20km_rad = coalesce(sum(exposure[distance < 20]), 0),
    sum_exposure_50km_rad = coalesce(sum(exposure[distance < 50]), 0),
    sum_exposure_100km_rad = coalesce(sum(exposure[distance < 100]), 0),
    avg_all_exposure_20 = coalesce(mean(exposure[distance < 20]), 0),
    avg_all_exposure_50 = coalesce(mean(exposure[distance < 50]), 0),
    avg_all_exposure_100 = coalesce(mean(exposure[distance < 100]), 0),
    avg_all_exposure_150 = coalesce(mean(exposure[distance < 150]), 0),
    avg_all_exposure_200 = coalesce(mean(exposure[distance < 200]), 0),
    avg_all_exposure_full = coalesce(mean(exposure), 0),
    avg_all_exposure_ln_100 = log(avg_all_exposure_100),
    avg_all_exposure_ln_150 = log(avg_all_exposure_150),
    avg_all_exposure_ln_200 = log(avg_all_exposure_200),
    avg_all_exposure_ln_full = log(avg_all_exposure_full),
    ## Infinite, NaN and NA logs all become 0 (same result as the former
    ## two-step "Inf -> NA, then NA -> 0" clean-up).
    across(starts_with("avg_all_exposure_ln"), ~if_else(is.finite(.x), .x, 0))
  ) %>%
  filter(distance == min(distance)) %>% ## Now filter to smallest distance
  filter(population == max(population)) %>% ## To break any remaining ties, take the largest settlement
  rename(nearest_exposure = exposure, nearest_exposure_ln = exposure_ln) %>%
  dplyr::select(-c(camp, distance, population, starts_with("D_02_ID"))) %>%
  distinct() %>%
  ungroup()

## Merge the per-respondent exposure measures back onto the AB data.
## No `by` is given, so the join uses every column name the two tables
## share. Respondents without a row in exposure_measure_df_o_all are dropped
## by the inner join; the printed row count shows how many remain.
ab_data <- inner_join(ab_data, exposure_measure_df_o_all)
nrow(ab_data)

## Output merged dataset
## The output file is passed positionally: readr deprecated the `path`
## argument of write_csv() in favour of `file`, and the positional form
## works under either argument name.
write_csv(ab_data, "ab_data_merged.csv")
